In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline
In [2]:
df = pd.read_csv('data/college_financials.csv', header=0)
In [10]:
yr4 = df#.query('sector in (1,2,3)')
len(yr4)
Out[10]:
87560
In [11]:
pd.value_counts(yr4['academic_year']).sort_index().plot.bar()
Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0xe098fd0>
In [12]:
yr4['namecity'] = yr4['inst_name'] + yr4['city'] + yr4['state']
schoolrecords = pd.value_counts(yr4['namecity'])
In [13]:
pd.value_counts(schoolrecords).sort_index().plot.bar()
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0xbfbedd8>
In [14]:
joined = yr4.join(schoolrecords, on='namecity', lsuffix='yr4', rsuffix='cnt')
In [15]:
final_df = joined[joined['namecitycnt'] == 13].drop('namecitycnt', axis=1)
In [16]:
len(final_df)
Out[16]:
46540
In [17]:
for c in final_df.columns:
    if final_df[c].dtype in (object, np.int64):
        print('skipping {}'.format(c))
        continue
    if '_pct' not in c and 'index' not in c and 'scalar' not in c:
        # ajust for inflation
        print('adjusting {} for inflation'.format(c))
        final_df[c] = final_df[c] / df['hepi_scalar_2012']
    final_df['{}_prev'.format(c)] = final_df.groupby('namecityyr4')[c].shift(1)
    final_df['{}_change'.format(c)] = (final_df[c] - final_df['{}_prev'.format(c)]) / final_df['{}_prev'.format(c)]
skipping academic_year
skipping inst_name
skipping tcs_name
skipping city
skipping state
skipping zipcode
skipping sector
adjusting gross_tuition_fees_rev for inflation
adjusting net_tuition_fees_rev for inflation
adjusting rev_fed_grant for inflation
adjusting rev_state_grant for inflation
adjusting rev_local_grant for inflation
adjusting rev_state_local_grant_contract for inflation
adjusting rev_fedapp_grant_contract for inflation
adjusting investment for inflation
adjusting total_rev for inflation
adjusting grant01 for inflation
adjusting grant02 for inflation
adjusting grant03 for inflation
adjusting grant04 for inflation
adjusting grant05 for inflation
adjusting grant06 for inflation
adjusting grant07 for inflation
adjusting institutional_grant_aid for inflation
adjusting institutional_grant_aid_share for inflation
adjusting tuition_discount for inflation
adjusting fed_grant_avg_amount for inflation
adjusting state_grant_num for inflation
adjusting inst_grant_avg_amount for inflation
adjusting loan_avg_amount for inflation
adjusting tuition01_tf for inflation
adjusting tuition02_tf for inflation
adjusting tuition03_tf for inflation
adjusting tuition05_tf for inflation
adjusting tuition06_tf for inflation
adjusting tuition07_tf for inflation
adjusting total_exp for inflation
adjusting total_eandg for inflation
adjusting total_eandg_adj for inflation
adjusting eandr for inflation
adjusting eandr_degree for inflation
adjusting average_subsidy for inflation
adjusting gross_operating_margin for inflation
adjusting total_assets for inflation
adjusting total_liabilities for inflation
adjusting total_net_assets for inflation
adjusting total_faculty_all for inflation
adjusting all_employees for inflation
adjusting ft_faculty_salary for inflation
skipping namecityyr4
In [52]:
final_df[final_df['inst_name'] == 'Stanford University']
Out[52]:
academic_year inst_name tcs_name city state zipcode sector cpi_index cpi_scalar_2012 hepi_index ... total_liabilities_prev total_liabilities_change total_net_assets_prev total_net_assets_change total_faculty_all_prev total_faculty_all_change all_employees_prev all_employees_change ft_faculty_salary_prev ft_faculty_salary_change
25103 2000 Stanford University Stanford University Stanford CA 94305 2 169.300 0.743963 196.9 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
25104 2001 Stanford University Stanford University Stanford CA 94305 2 175.100 0.769450 208.7 ... 3.429076e+09 0.025139 1.744974e+10 -0.102507 2559.729814 NaN 12261.090916 NaN 152271.155196 NaN
25105 2002 Stanford University Stanford University Stanford CA 94305 2 178.200 0.783073 212.7 ... 3.515278e+09 -0.130399 1.566103e+10 -0.069486 NaN NaN NaN NaN NaN NaN
25106 2003 Stanford University Stanford University Stanford CA 94305 2 182.100 0.800211 223.5 ... 3.056889e+09 0.337409 1.457280e+10 0.039845 3433.762108 NaN 12952.078991 NaN 153271.510094 -0.000989
25107 2004 Stanford University Stanford University Stanford CA 94305 2 186.100 0.817788 231.7 ... 4.088310e+09 -0.070816 1.515346e+10 0.092331 NaN NaN NaN NaN 153119.895680 0.002667
25108 2005 Stanford University Stanford University Stanford CA 94305 2 191.700 0.842397 240.8 ... 3.798794e+09 0.058136 1.655259e+10 0.157132 5376.809669 NaN 15229.443249 NaN 153528.241878 -0.212865
25109 2006 Stanford University Stanford University Stanford CA 94305 2 199.000 0.874475 253.1 ... 4.019639e+09 -0.017962 1.915353e+10 0.112912 NaN NaN NaN NaN 120847.508189 0.010683
25110 2007 Stanford University Stanford University Stanford CA 94305 2 204.138 0.897054 260.3 ... 3.947440e+09 0.121474 2.131619e+10 0.160277 8355.794546 NaN 17091.555904 NaN 122138.554161 -0.004217
25111 2008 Stanford University Stanford University Stanford CA 94305 2 211.702 0.930292 273.2 ... 4.426952e+09 -0.027148 2.473269e+10 -0.015403 NaN NaN NaN NaN 121623.442204 0.072087
25112 2009 Stanford University Stanford University Stanford CA 94305 2 214.658 0.943282 279.3 ... 4.306770e+09 0.129166 2.435173e+10 -0.222363 7982.509517 NaN 16881.537335 NaN 130390.857837 -0.026142
25113 2010 Stanford University Stanford University Stanford CA 94305 2 216.735 0.952409 281.8 ... 4.863057e+09 0.095085 1.893679e+10 0.067823 NaN NaN NaN NaN 126982.209333 0.003790
25114 2011 Stanford University Stanford University Stanford CA 94305 2 221.087 0.971533 288.4 ... 5.325460e+09 -0.018239 2.022115e+10 0.133971 8256.004261 NaN 17088.420162 NaN 127463.515250 0.000106
25115 2012 Stanford University Stanford University Stanford CA 94305 2 227.565 1.000000 293.2 ... 5.228328e+09 0.047439 2.293018e+10 0.025086 NaN NaN NaN NaN 127477.001535 0.031589

13 rows × 167 columns

In [18]:
final_df.to_csv('data/college_financials_only13.csv')
In [54]:
sns.set_context("notebook", font_scale=1.1)
sns.set_style("ticks")
for c in final_df.columns:
    logged = False
    if final_df[c].dtype in (object, np.int64) or '_prev' in c or '_change' in c or 'cpi' in c or 'hepi' in c or 'heca' in c: 
        continue
    temp_df = final_df[['{}_prev'.format(c), '{}_change'.format(c), 'academic_year', 'sector']].copy()
    if '_pct' not in c:
        temp_df['{}_prev'.format(c)] = temp_df['{}_prev'.format(c)].apply(lambda x: np.sign(x) * np.log10(x) if x != 0 else 0)
        logged = True
    lm = sns.lmplot('{}_prev'.format(c),
                    '{}_change'.format(c),
                    data = temp_df,
                   fit_reg=False,
                   hue = 'academic_year', col='sector', sharex=False)
    lm.axes[0,0].set_ylim([-1,2])
    lm.axes[0,1].set_ylim([-1,2])
    lm.axes[0,2].set_ylim([-1,2])
    if '_pct' in c:
        lm.axes[0,0].set_xlim([0,100])
        lm.axes[0,1].set_xlim([0,100])
        lm.axes[0,2].set_xlim([0,100])
C:\ProgramData\Miniconda3\lib\site-packages\matplotlib\pyplot.py:523: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
In [86]:
for c in final_df.columns:
    logged = False
    if final_df[c].dtype in (object, np.int64) or '_prev' in c or '_change' in c or 'cpi' in c or 'hepi' in c or 'heca' in c: 
        continue
    temp_df = final_df[['{}_prev'.format(c), c, 'academic_year', 'sector']].copy()
    if '_pct' not in c:
        temp_df['{}_prev'.format(c)] = temp_df['{}_prev'.format(c)].apply(lambda x: np.sign(x) * np.log10(x) if x != 0 else 0)
        temp_df[c] = temp_df[c].apply(lambda x: np.sign(x) * np.log10(x) if x != 0 else 0)
        logged = True
    lm = sns.lmplot('{}_prev'.format(c),c,
                    data = temp_df,
                    fit_reg = False,
                   hue = 'academic_year', col='sector', sharex=False)
    if '_pct' in c:
        for i in range(3):
            lm.axes[0,i].set_xlim([0,100])
            lm.axes[0,i].set_ylim([0,100])
    xlims = lm.axes[0,0].get_xlim()
    ylims = lm.axes[0,0].get_ylim()
    start = np.maximum(xlims[0], ylims[0])
    end = np.minimum(xlims[1], ylims[1])
    for i in range(3):
        lm.axes[0,i].plot([start, end], [start,end])
        lm.axes[0,i].set_xlim([start,end])
        lm.axes[0,i].set_ylim([start,end])
C:\ProgramData\Miniconda3\lib\site-packages\matplotlib\pyplot.py:523: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
In [88]:
def dig_deep(col):
    temp_df = final_df[['academic_year', 'sector',col, '{}_prev'.format(col)]].copy()
    temp_df[col] = temp_df[col].apply(lambda x: np.sign(x) * np.log10(x) if x != 0 else 0)
    temp_df['{}_prev'.format(col)] = temp_df['{}_prev'.format(col)].apply(lambda x: np.sign(x) * np.log10(x) if x != 0 else 0)
    lm = sns.lmplot('{}_prev'.format(col), col, fit_reg=False, col = 'sector', row='academic_year', data=temp_df, scatter=True)

    for j in range(13):
        xlims = lm.axes[j,0].get_xlim()
        ylims = lm.axes[j,0].get_ylim()
        start = np.maximum(xlims[0], ylims[0])
        end = np.minimum(xlims[1], ylims[1])
        for i in range(3):
            lm.axes[j,i].plot([start, end], [start,end])
            lm.axes[j,i].set_xlim([start, end])
            lm.axes[j,i].set_ylim([start, end])
In [89]:
dig_deep('investment')
        
In [90]:
dig_deep('grant02')
In [91]:
dig_deep('institutional_grant_aid')